Mini Project #02: Making Backyards Affordable for All

Author

Hyacinthe Sarr

Introduction

Housing affordability is a critical issue affecting millions of people across the United States. IN this project we will analyze various factors influencing housing affordability, including household income, rent prices, housing supply, and employment wages across different metropolitan areas (CBSAs) over time. WE will use data from the American Community Survey (ACS), Building Permits Survey, and Bureau of Labor Statistics (BLS), to construct indices to measure rent burden and housing stock growth. The insights derived from this analysis will help inform policy decisions and strategies to make housing more affordable for all.

Data Acquisition

The following code will download all the necessary datasets for this project.

Show code
if (!dir.exists(file.path("data", "mp02"))) {
  dir.create(file.path("data", "mp02"), showWarnings = FALSE, recursive = TRUE)
}

ensure_package <- function(pkg) {
  pkg <- as.character(substitute(pkg))
  options(repos = c(CRAN = "https://cloud.r-project.org"))
  if (!require(pkg, character.only = TRUE, quietly = TRUE)) install.packages(pkg)
  stopifnot(require(pkg, character.only = TRUE, quietly = TRUE))
}

ensure_package(tidyverse)
ensure_package(glue)
ensure_package(readxl)
ensure_package(tidycensus)

get_acs_all_years <- function(variable, geography = "cbsa",
                              start_year = 2009, end_year = 2023) {
  fname <- glue("{variable}_{geography}_{start_year}_{end_year}.csv")
  fname <- file.path("data", "mp02", fname)

  if (!file.exists(fname)) {
    YEARS <- seq(start_year, end_year)
    YEARS <- YEARS[YEARS != 2020] # Drop 2020 - No survey (covid)

    ALL_DATA <- map(YEARS, function(yy) {
      tidycensus::get_acs(geography, variable, year = yy, survey = "acs1") |>
        mutate(year = yy) |>
        select(-moe, -variable) |>
        rename(!!variable := estimate)
    }) |> bind_rows()

    write_csv(ALL_DATA, fname)
  }

  read_csv(fname, show_col_types = FALSE)
}

# Household income (12 month)
INCOME <- get_acs_all_years("B19013_001") |>
  rename(household_income = B19013_001)

# Monthly rent
RENT <- get_acs_all_years("B25064_001") |>
  rename(monthly_rent = B25064_001)

# Total population
POPULATION <- get_acs_all_years("B01003_001") |>
  rename(population = B01003_001)

# Total number of households
HOUSEHOLDS <- get_acs_all_years("B11001_001") |>
  rename(households = B11001_001)



get_building_permits <- function(start_year = 2009, end_year = 2023) {
  fname <- glue("housing_units_{start_year}_{end_year}.csv")
  fname <- file.path("data", "mp02", fname)

  if (!file.exists(fname)) {
    HISTORICAL_YEARS <- seq(start_year, 2018)

    HISTORICAL_DATA <- map(HISTORICAL_YEARS, function(yy) {
      historical_url <- glue("https://www.census.gov/construction/bps/txt/tb3u{yy}.txt")

      LINES <- readLines(historical_url)[-c(1:11)]

      CBSA_LINES <- str_detect(LINES, "^[[:digit:]]")
      CBSA <- as.integer(str_sub(LINES[CBSA_LINES], 5, 10))

      PERMIT_LINES <- str_detect(str_sub(LINES, 48, 53), "[[:digit:]]")
      PERMITS <- as.integer(str_sub(LINES[PERMIT_LINES], 48, 53))

      data_frame(
        CBSA = CBSA,
        new_housing_units_permitted = PERMITS,
        year = yy
      )
    }) |> bind_rows()

    CURRENT_YEARS <- seq(2019, end_year)

    CURRENT_DATA <- map(CURRENT_YEARS, function(yy) {
      current_url <- glue("https://www.census.gov/construction/bps/xls/msaannual_{yy}99.xls")

      temp <- tempfile()

      download.file(current_url, destfile = temp, mode = "wb")

      fallback <- function(.f1, .f2) {
        function(...) {
          tryCatch(.f1(...),
            error = function(e) .f2(...)
          )
        }
      }

      reader <- fallback(read_xlsx, read_xls)

      reader(temp, skip = 5) |>
        na.omit() |>
        select(CBSA, Total) |>
        mutate(year = yy) |>
        rename(new_housing_units_permitted = Total)
    }) |> bind_rows()

    ALL_DATA <- rbind(HISTORICAL_DATA, CURRENT_DATA)

    write_csv(ALL_DATA, fname)
  }

  read_csv(fname, show_col_types = FALSE)
}

PERMITS <- get_building_permits()


ensure_package(httr2)
ensure_package(rvest)
get_bls_industry_codes <- function() {
  fname <- fname <- file.path("data", "mp02", "bls_industry_codes.csv")

  if (!file.exists(fname)) {
    resp <- request("https://www.bls.gov") |>
      req_url_path("cew", "classifications", "industry", "industry-titles.htm") |>
      req_headers(`User-Agent` = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:143.0) Gecko/20100101 Firefox/143.0") |>
      req_error(is_error = \(resp) FALSE) |>
      req_perform()

    resp_check_status(resp)

    naics_table <- resp_body_html(resp) |>
      html_element("#naics_titles") |>
      html_table() |>
      mutate(title = str_trim(str_remove(str_remove(`Industry Title`, Code), "NAICS"))) |>
      select(-`Industry Title`) |>
      mutate(depth = if_else(nchar(Code) <= 5, nchar(Code) - 1, NA)) |>
      filter(!is.na(depth))

    naics_table <- naics_table |>
      filter(depth == 4) |>
      rename(level4_title = title) |>
      mutate(
        level1_code = str_sub(Code, end = 2),
        level2_code = str_sub(Code, end = 3),
        level3_code = str_sub(Code, end = 4)
      ) |>
      left_join(naics_table, join_by(level1_code == Code)) |>
      rename(level1_title = title) |>
      left_join(naics_table, join_by(level2_code == Code)) |>
      rename(level2_title = title) |>
      left_join(naics_table, join_by(level3_code == Code)) |>
      rename(level3_title = title) |>
      select(-starts_with("depth")) |>
      rename(level4_code = Code) |>
      select(
        level1_title, level2_title, level3_title, level4_title,
        level1_code, level2_code, level3_code, level4_code
      )

    write_csv(naics_table, fname)
  }

  read_csv(fname, show_col_types = FALSE)
}

INDUSTRY_CODES <- get_bls_industry_codes()


ensure_package(httr2)
ensure_package(rvest)
get_bls_qcew_annual_averages <- function(start_year = 2009, end_year = 2023) {
  fname <- glue("bls_qcew_{start_year}_{end_year}.csv.gz")
  fname <- file.path("data", "mp02", fname)

  YEARS <- seq(start_year, end_year)
  YEARS <- YEARS[YEARS != 2020] # Drop Covid year to match ACS

  if (!file.exists(fname)) {
    ALL_DATA <- map(YEARS, .progress = TRUE, possibly(function(yy) {
      fname_inner <- file.path("data", "mp02", glue("{yy}_qcew_annual_singlefile.zip"))

      if (!file.exists(fname_inner)) {
        request("https://www.bls.gov") |>
          req_url_path(
            "cew", "data", "files", yy, "csv",
            glue("{yy}_annual_singlefile.zip")
          ) |>
          req_headers(`User-Agent` = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:143.0) Gecko/20100101 Firefox/143.0") |>
          req_retry(max_tries = 5) |>
          req_perform(fname_inner)
      }

      if (file.info(fname_inner)$size < 755e5) {
        warning(sQuote(fname_inner), "appears corrupted. Please delete and retry this step.")
      }

      read_csv(fname_inner,
        show_col_types = FALSE
      ) |>
        mutate(YEAR = yy) |>
        select(
          area_fips,
          industry_code,
          annual_avg_emplvl,
          total_annual_wages,
          YEAR
        ) |>
        filter(
          nchar(industry_code) <= 5,
          str_starts(area_fips, "C")
        ) |>
        filter(str_detect(industry_code, "-", negate = TRUE)) |>
        mutate(
          FIPS = area_fips,
          INDUSTRY = as.integer(industry_code),
          EMPLOYMENT = as.integer(annual_avg_emplvl),
          TOTAL_WAGES = total_annual_wages
        ) |>
        select(
          -area_fips,
          -industry_code,
          -annual_avg_emplvl,
          -total_annual_wages
        ) |>
        # 10 is a special value: "all industries" , so omit
        filter(INDUSTRY != 10) |>
        mutate(AVG_WAGE = TOTAL_WAGES / EMPLOYMENT)
    })) |> bind_rows()

    write_csv(ALL_DATA, fname)
  }

  ALL_DATA <- read_csv(fname, show_col_types = FALSE)

  ALL_DATA_YEARS <- unique(ALL_DATA$YEAR)

  YEARS_DIFF <- setdiff(YEARS, ALL_DATA_YEARS)

  if (length(YEARS_DIFF) > 0) {
    stop(
      "Download failed for the following years: ", YEARS_DIFF,
      ". Please delete intermediate files and try again."
    )
  }

  ALL_DATA
}

WAGES <- get_bls_qcew_annual_averages()

Data Integration and Initial Exploration

Extra Credit Opportunity #01: Relationship Diagram

The diagram below summarizes the structure of the datasets used in this project and the relationships among them.

Each dataset in this project represents a distinct source of information that is related to housing affordability, and the diagram above shows the connections among them based on shared keys:

  • ACS Tables — INCOME, RENT, POPULATION, and HOUSEHOLDS:
    These datasets share the geographic identifier GEOID and the variable year.
    These common keys allow us to align indicators such as median household income, monthly rent, population size, and number of households for each metropolitan area and each year.

  • PERMITS (Building Permits Survey):
    Uses the CBSA code to identify the same metropolitan areas.
    Here though the CBSA and GEOID are not identical, they represent equivalent regional boundaries and can be cross-referenced to connect new housing construction data with the ACS demographic measures.

  • WAGES (Bureau of Labor Statistics):
    Uses the FIPS code to identify regions and includes variables for total employment, total wages, and average annual wage by industry (INDUSTRY).
    Each INDUSTRY code link has a detailed description in the look up table INDUSTRY_CODES.

  • INDUSTRY_CODES (Lookup Table):
    Provides hierarchical industry classification labels that map the numeric codes in the WAGES dataset to descriptive names.

Put together, all these relationships show how multiple data sources with different identifiers (GEOID, CBSA, FIPS) and time references (year) can be joined together.
We have enough to study how income, rent, housing supply, and wage dynamics interact across U.S. metropolitan areas over time.

Multi-Table Questions

1.Which CBSA (by name) permitted the largest number of new housing units in the decade from 2010 to 2019 (inclusive)?

Show code
library(dplyr)
library(DT)
library(stringr)

largest_cbsa <- PERMITS |>
  filter(year >= 2010 & year <= 2019) |>
  group_by(CBSA) %>%
  summarize(total_permits = sum(new_housing_units_permitted, na.rm = TRUE)) |>
  arrange(desc(total_permits)) |>
  left_join(HOUSEHOLDS, by = c("CBSA" = "GEOID")) |>
  slice(1) |>
  select(NAME, total_permits)

highlight <- function(x) {
  paste0('<span style="color:#0000ff;"><b>', x, "</b></span>")
}

The CBSA that permitted the largest number of new housing units in the decade from 2010 to 2019 (inclusive) is Houston-Sugar Land-Baytown, TX Metro Area, which permitted 482,075 new housing units.

2.In what year did Albuquerque, NM (CBSA Number 10740) permit the most new housing units?

Show code
housing_permits <- PERMITS |>
  filter(CBSA == 10740) |>
  arrange(desc(new_housing_units_permitted)) |>
  slice(1) |>
  select(year, new_housing_units_permitted)

Abuquerque, NM (CBSA Number 10740) issued the most new housing units in the year of 2021, for a total of 4,021.

3.Which state (not CBSA) had the highest average individual income in 2015? To answer this question, you will need to first compute the total income per CBSA by multiplying the average household income by the number of households, and then sum total income and total population across all CBSAs in a state. With these numbers, you can answer this question.

Show code
library(stringr)
library(DT)
library(scales)

highest_indiv_income_2015 <- INCOME |>
  filter(year == 2015) |>
  mutate(state = str_extract(NAME, "., ([A-Z]{2})", group = 1)) |>
  left_join(HOUSEHOLDS |> filter(year == 2015) |> select(GEOID, households),
    by = "GEOID"
  ) |>
  mutate(total_income = household_income * households) |>
  group_by(state) |>
  summarize(
    total_income = sum(total_income, na.rm = TRUE),
    total_households = sum(households, na.rm = TRUE)
  ) |>
  mutate(avg_individual_income = total_income / total_households) |>
  arrange(desc(avg_individual_income)) |>
  slice(1) |>
  select(state, avg_individual_income)

The state with the highest average individual income in 2015 was DC, with an average individual income of $93,294.00.

4.Data scientists and business analysts are recorded under NAICS code 5182. What is the last year in which the NYC CBSA had the most data scientists in the country? In recent, the San Francisco CBSA has had the most data scientists.

Show code
library(dplyr)
library(DT)
library(stringr)

format_titles <- function(df) {
  colnames(df) <- str_replace_all(colnames(df), "_", " ") |> str_to_title()
  return(df)
}


# Filter WAGES data for data scientists (NAICS 5182) first
wages_filtered <- WAGES |>
  filter(INDUSTRY == 5182) |>
  mutate(std_cbsa = paste0(FIPS, "0"))

# Filter POPULATION data and prepare for join
population_filtered <- POPULATION |>
  mutate(std_cbsa = paste0("C", GEOID))

# Join the datasets on std_cbsa and year
data_scientists <- wages_filtered |>
  inner_join(
    population_filtered |> select(std_cbsa, NAME, year),
    by = c("std_cbsa" = "std_cbsa", "YEAR" = "year")
  )

# Group by year and CBSA to find which had the most data scientists
ds_by_year <- data_scientists |>
  group_by(YEAR, NAME) |>
  summarise(total_employment = sum(EMPLOYMENT, na.rm = TRUE), .groups = "drop") |>
  group_by(YEAR) |>
  slice_max(total_employment, n = 1) |>
  ungroup() |>
  arrange(desc(YEAR))

# Find the last year NYC had the most data scientists
last_nyc_year <- ds_by_year |>
  filter(grepl("New York", NAME, ignore.case = TRUE)) |>
  pull(YEAR) |>
  max()


# Filter to show only NYC area rows
nyc_only <- ds_by_year |>
  filter(grepl("New York", NAME, ignore.case = TRUE)) |>
  format_titles()

# Store the formatted column names
formatted_names <- names(nyc_only)

datatable(
  nyc_only,
  colnames = formatted_names, # Explicitly use formatted names
  options = list(
    searching = FALSE,
    info = FALSE,
    rownames = FALSE
  )
) |>
  formatRound(3) # Use column index instead of name

The last year in which the NYC CBSA had the most data scientists in the country was 2015.

5. What fraction of total wages in the NYC CBSA was earned by people employed in the finance and insurance industries (NAICS code 52)? In what year did this fraction peak?

Show code
#  Step 1: Filter WAGES data before joining
wages_filtered <- WAGES |>
  mutate(std_cbsa = paste0(FIPS, "0"))

# Step 2: Filter POPULATION data and prepare for join
population_filtered <- POPULATION |>
  mutate(std_cbsa = paste0("C", GEOID)) |>
  filter(grepl("New York", NAME, ignore.case = TRUE))

# Step 3: Join to get NYC data only
nyc_wages <- wages_filtered |>
  inner_join(
    population_filtered |> select(std_cbsa, NAME, year),
    by = c("std_cbsa" = "std_cbsa", "YEAR" = "year")
  )

# Step 4: Calculate total wages by year for NYC
total_wages_by_year <- nyc_wages |>
  group_by(YEAR) |>
  summarise(total_wages = sum(TOTAL_WAGES, na.rm = TRUE), .groups = "drop")

# Step 5: Calculate finance and insurance (NAICS 52) wages by year for NYC
finance_wages_by_year <- nyc_wages |>
  filter(INDUSTRY == 52) |>
  group_by(YEAR) |>
  summarise(finance_wages = sum(TOTAL_WAGES, na.rm = TRUE), .groups = "drop")

# Step 6: Join and calculate the fraction
finance_fraction <- total_wages_by_year |>
  inner_join(finance_wages_by_year, by = "YEAR") |>
  mutate(fraction = finance_wages / total_wages) |>
  arrange(YEAR)


# Find the year with the peak fraction
peak_year <- finance_fraction |>
  slice_max(fraction, n = 1)

The fraction of total wages in the NYC CBSA earned by people employed in the finance and insurance industries The fraction of total wages in the NYC CBSA earned by people employed in the finance and insurance industries peaked in the year 2014, with a fraction of 4.6%.

Initial Visualizations

1. The relationship between monthly rent and average household income per CBSA in 2009

Show code
library(ggpmisc)
library(ggplot2)
library(dplyr)

income_rent_2009 <- INCOME |>
  filter(year == 2009) |>
  inner_join(RENT |> filter(year == 2009), by = c("GEOID", "year"))

ggplot(income_rent_2009, aes(x = household_income, y = monthly_rent)) +
  geom_point(color = "#3498db", alpha = 0.6, size = 2) +
  stat_poly_line(se = FALSE, color = "#e74c3c", linewidth = 1) +
  stat_poly_eq(aes(label = paste(after_stat(eq.label),
    after_stat(rr.label),
    sep = "~~~"
  ))) +
  scale_x_continuous(labels = scales::dollar_format()) +
  scale_y_continuous(labels = scales::dollar_format()) +
  labs(
    title = "Monthly Rent vs. Average Household Income (2009)",
    subtitle = "Each point represents a Core-Based Statistical Area (CBSA)",
    x = "Average Household Income",
    y = "Monthly Rent",
    caption = "Source: American Community Survey (ACS)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 11, hjust = 0.5),
    axis.title = element_text(size = 11, face = "bold"),
    axis.text = element_text(size = 10)
  )

Note: The correlation coefficient (r) = √R² = √0.58 ≈ 0.76

There is a clear positive correlation between average household income and monthly rent in 2009. As household income increases, monthly rent tends to increase as well. The fitted regression line indicates that higher-income households generally pay higher rents, suggesting that rent prices are influenced by the income levels of residents in different CBSAs. The coefficient of determination (R²) of approximately 0.58 indicates that about 58% of the variation in monthly rent can be explained by average household income, translating to a correlation coefficient of r = 0.76, which suggests a strong positive relationship.

2. The relationship between total employment and total employment in the health care and social services sector (NAICS 62) across different CBSAs. Design your visualization so that it is possible to see the evolution of this relationship over time

Show code
library(dplyr)
library(ggplot2)
library(DT)

# Step 1: Filter WAGES data for healthcare (NAICS 62) before joining
healthcare_wages <- WAGES |>
  filter(INDUSTRY == 62) |>
  mutate(std_cbsa = paste0(FIPS, "0")) |>
  group_by(std_cbsa, YEAR) |>
  summarise(healthcare_employment = sum(EMPLOYMENT, na.rm = TRUE), .groups = "drop")

# Step 2: Calculate total employment by CBSA and year
total_employment <- WAGES |>
  mutate(std_cbsa = paste0(FIPS, "0")) |>
  group_by(std_cbsa, YEAR) |>
  summarise(total_employment = sum(EMPLOYMENT, na.rm = TRUE), .groups = "drop")

# Step 3: Join healthcare and total employment data
employment_data <- total_employment |>
  inner_join(healthcare_wages, by = c("std_cbsa", "YEAR")) |>
  inner_join(
    POPULATION |> mutate(std_cbsa = paste0("C", GEOID)) |> select(std_cbsa, NAME, year),
    by = c("std_cbsa" = "std_cbsa", "YEAR" = "year")
  ) |>
  mutate(healthcare_share = healthcare_employment / total_employment)

# Clean data
employment_data_clean <- employment_data |>
  filter(
    total_employment > 0,
    healthcare_employment > 0,
    is.finite(total_employment),
    is.finite(healthcare_employment)
  )

# Create visualization
ggplot(employment_data_clean, aes(x = total_employment, y = healthcare_employment, color = as.factor(YEAR))) +
  geom_point(alpha = 0.6, size = 2) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, alpha = 0.3, linewidth = 0.8) +
  facet_wrap(~YEAR, ncol = 4) +
  scale_x_log10(
    breaks = scales::trans_breaks("log10", function(x) 10^x, n = 5),
    labels = scales::label_number(scale_cut = scales::cut_short_scale())
  ) +
  scale_y_log10(
    breaks = scales::trans_breaks("log10", function(x) 10^x, n = 5),
    labels = scales::label_number(scale_cut = scales::cut_short_scale())
  ) +
  labs(
    title = "Total Employment vs Healthcare Employment by CBSA",
    subtitle = "Evolution over time (2009-2023); each panel represents one year",
    x = "Total Employment (log scale)",
    y = "Healthcare & Social Services Employment (log scale)",
    caption = "Source: Bureau of Labor Statistics Quarterly Census of Employment and Wages"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 11, hjust = 0.5),
    axis.title = element_text(size = 11, face = "bold"),
    axis.text = element_text(size = 9),
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(size = 10, face = "bold"),
    legend.position = "none"
  )

The faceted visualization reveals a consistent positive relationship between total employment and healthcare employment across all years. The log-log scale allows us to observe this relationship across CBSAs of vastly different sizes. The linear relationship on the log scale suggests that healthcare employment scales proportionally with total employment across metropolitan areas. The consistency of this pattern across years indicates that healthcare represents a relatively stable share of total employment, regardless of overall metro size or economic conditions.

3. The evolution of average household size over time. Use different lines to represent different CBSAs

The plot below tracks average household size from 2009 to 2023 across more than 380 metropolitan areas.
Most metros have remained stable, though the largest cities tend to show slightly smaller households over time.

Show code
library(dplyr)
library(ggplot2)
library(RColorBrewer)

# Calculate average household size over time for each CBSA
household_size_data <- POPULATION |>
  inner_join(HOUSEHOLDS, by = c("GEOID", "NAME", "year")) |>
  mutate(
    avg_household_size = population / households,
    geoid_numeric = as.numeric(GEOID)
  )

# Create spaghetti plot
ggplot(household_size_data, aes(
  x = year, y = avg_household_size,
  group = GEOID, color = geoid_numeric
)) +
  geom_line(alpha = 0.6, linewidth = 0.5) +
  scale_color_viridis_c(option = "viridis") +
  scale_y_continuous(
    breaks = seq(2, 3.5, 0.25),
    limits = c(2, 3.5)
  ) +
  scale_x_continuous(
    breaks = seq(2009, 2023, 2)
  ) +
  labs(
    title = "Evolution of Average Household Size Over Time (2009-2023)",
    subtitle = "Each line represents a different CBSA",
    x = "Year",
    y = "Average Household Size (persons per household)",
    caption = "Source: American Community Survey (ACS)\nNote: 2020 data unavailable due to COVID-19"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 11, hjust = 0.5),
    plot.caption = element_text(size = 9, hjust = 0, color = "gray40"),
    axis.title = element_text(size = 11, face = "bold"),
    axis.text = element_text(size = 10),
    panel.grid.minor = element_line(linetype = "dotted", linewidth = 0.3),
    legend.position = "none"
  )

Building Indices of Housing Affordability and Housing Stock Growth

We will begin by constructing an initial metric of rent affordability by combining our INCOME, RENT, and POPULATION tables from above. Using an appropriate join, we merge these three into a single table which can be used to perform the following task.

Rent Burden

Housing affordability is often measured by the percentage of household income spent on rent. When families spend too much on housing, they have less money for other essentials like food, healthcare, education, and savings. The U.S. Department of Housing and Urban Development (HUD) considers households “cost-burdened” if they spend more than 30% of their income on housing, and “severely cost-burdened” if they spend more than 50%.

To understand which metropolitan areas face the most severe affordability challenges, we constructed a Rent Burden Index that standardizes rent-to-income ratios across all CBSAs and over time. This index allows us to identify cities where the residents face the highest housing cost pressures and track whether affordability is improving or worsening. By setting the national average at 50, we can easily see which cities are above or below average in terms of rent burden—higher scores indicate greater affordability challenges, while lower scores suggest more affordable housing markets.

Show code
# Load required libraries
library(dplyr)
library(DT)
library(htmltools)
library(scales)

# Step 1: Join INCOME and RENT tables
rent_burden_raw <- INCOME |>
  inner_join(
    RENT,
    by = c("GEOID" = "GEOID", "year" = "year"),
    suffix = c("_income", "_rent")
  ) |>
  select(GEOID, NAME_income, year, household_income, monthly_rent) |>
  rename(NAME = NAME_income)

# Step 2: Calculate basic rent-to-income ratio
# Annualize monthly rent and compute ratio
rent_burden_raw <- rent_burden_raw |>
  mutate(
    annual_rent = monthly_rent * 12,
    raw_rent_to_income = annual_rent / household_income
  )

# Step 3: Calculate national baseline (long-term average across all years and CBSAs)
national_baseline <- rent_burden_raw |>
  summarise(mean_rent_to_income = mean(raw_rent_to_income, na.rm = TRUE)) |>
  pull(mean_rent_to_income)


# Step 4: Create standardized rent burden index
rent_burden_data <- rent_burden_raw |>
  mutate(
    # Standardized index (50 = national average)
    rent_burden_index = (raw_rent_to_income / national_baseline) * 50,

    # Classification for interpretation
    burden_level = case_when(
      rent_burden_index < 30 ~ "Very Low",
      rent_burden_index < 40 ~ "Low",
      rent_burden_index < 50 ~ "Below Average",
      rent_burden_index < 60 ~ "Above Average",
      rent_burden_index < 70 ~ "High",
      TRUE ~ "Very High"
    )
  ) |>
  select(
    GEOID, NAME, year, household_income, monthly_rent,
    annual_rent, raw_rent_to_income, rent_burden_index, burden_level
  ) |>
  arrange(year, rent_burden_index)

# Prepare tables for display
top_10_highest <- rent_burden_data |>
  filter(year == max(year)) |>
  slice_max(rent_burden_index, n = 10) |>
  select(NAME, year, household_income, monthly_rent, rent_burden_index, burden_level)

top_10_lowest <- rent_burden_data |>
  filter(year == max(year)) |>
  slice_min(rent_burden_index, n = 10) |>
  select(NAME, year, household_income, monthly_rent, rent_burden_index, burden_level)
Note

National baseline rent-to-income ratio: 19.7%
This sets the Rent Burden Index baseline at 50, representing the national average

Housing Growth

Show code
# Load required libraries
library(dplyr)
library(RcppRoll)
library(DT)
library(htmltools)

dt_clean <- function(df, caption_text = NULL) {
  datatable(
    df,
    rownames = FALSE,
    caption = if (!is.null(caption_text)) {
      tags$caption(
        style = "caption-side: top; text-align: left; font-weight: 600; padding-bottom: 6px;",
        caption_text
      )
    } else {
      NULL
    },
    options = list(
      dom = "t",
      paging = FALSE,
      searching = FALSE,
      lengthChange = FALSE,
      info = FALSE,
      autoWidth = TRUE,
      scrollX = TRUE
    )
  )
}

# Step 1: Join POPULATION and PERMITS tables
housing_data <- POPULATION |>
  select(GEOID, NAME, year, population) |>
  inner_join(
    PERMITS |> rename(GEOID = CBSA),
    by = c("GEOID" = "GEOID", "year" = "year")
  ) |>
  arrange(GEOID, year)

# Step 2: Calculate 5-year population growth within each CBSA
housing_data <- housing_data |>
  group_by(GEOID) |>
  mutate(
    population_5yr_ago = lag(population, n = 5),
    population_growth_5yr = population - population_5yr_ago,
    population_growth_rate_5yr = ((population - population_5yr_ago) / population_5yr_ago) * 100,
    year_indicator = if_else(year >= 2014, 1, 0)
  ) |>
  ungroup()

# Step 3: Calculate baseline statistics for standardization
national_permits_median <- housing_data |>
  summarise(median_permits = median(new_housing_units_permitted, na.rm = TRUE)) |>
  pull(median_permits)

national_pop_median <- housing_data |>
  summarise(median_pop = median(population, na.rm = TRUE)) |>
  pull(median_pop)

national_growth_median <- housing_data |>
  filter(year_indicator == 1) |>
  summarise(median_growth = median(population_growth_5yr, na.rm = TRUE)) |>
  pull(median_growth)

# Step 4: Construct Metric 1 - Instantaneous Housing Growth
housing_data <- housing_data |>
  mutate(
    permits_per_1000_pop = (new_housing_units_permitted / population) * 1000,
    national_permits_per_1000 = (national_permits_median / national_pop_median) * 1000,
    instantaneous_metric = (permits_per_1000_pop / national_permits_per_1000) * 50
  )

# Step 5: Construct Metric 2 - Rate-Based Housing Growth
housing_data <- housing_data |>
  mutate(
    population_growth_5yr_adj = pmax(population_growth_5yr, 1),
    permits_to_growth_ratio = new_housing_units_permitted / population_growth_5yr_adj,
    rate_based_metric = if_else(
      year_indicator == 1,
      (permits_to_growth_ratio / median(permits_to_growth_ratio, na.rm = TRUE)) * 50,
      NA_real_
    )
  )

# Step 6: Construct Composite Score
housing_data <- housing_data |>
  mutate(
    composite_score = if_else(
      year_indicator == 1,
      (instantaneous_metric + rate_based_metric) / 2,
      NA_real_
    )
  )

# Step 7: Apply rolling average (3-year) for stability
housing_data <- housing_data |>
  group_by(GEOID) |>
  mutate(
    composite_score_rolling = zoo::rollmean(composite_score, k = 3, align = "right", fill = NA)
  ) |>
  ungroup()

# Get most recent year with valid data
recent_year <- max(housing_data$year, na.rm = TRUE)

# Step 8: Instantaneous Metric - Top 10
instantaneous_top <- housing_data |>
  filter(year == recent_year) |>
  slice_max(instantaneous_metric, n = 10) |>
  select(
    NAME, year, population, new_housing_units_permitted,
    permits_per_1000_pop, instantaneous_metric
  ) |>
  arrange(desc(instantaneous_metric)) |>
  format_titles()

dt_clean(
  instantaneous_top,
  "Top 10 CBSAs - Instantaneous Housing Growth Metric"
) |>
  formatRound(c("Population", "New Housing Units Permitted", "Permits Per 1000 Pop", "Instantaneous Metric"), 2)
Show code
# Step 9: Instantaneous Metric - Bottom 10
instantaneous_bottom <- housing_data |>
  filter(year == recent_year) |>
  slice_min(instantaneous_metric, n = 10) |>
  select(
    NAME, year, population, new_housing_units_permitted,
    permits_per_1000_pop, instantaneous_metric
  ) |>
  arrange(instantaneous_metric) |>
  format_titles()

dt_clean(
  instantaneous_bottom,
  "Bottom 10 CBSAs - Instantaneous Housing Growth Metric"
) |>
  formatRound(c("Population", "Permits Per 1000 Pop", "Instantaneous Metric"), 2)
Show code
# Step 10: Rate-Based Metric - Top 10
rate_based_top <- housing_data |>
  filter(year == recent_year, !is.na(rate_based_metric)) |>
  slice_max(rate_based_metric, n = 10) |>
  select(
    NAME, year, population_growth_5yr, new_housing_units_permitted,
    permits_to_growth_ratio, rate_based_metric
  ) |>
  arrange(desc(rate_based_metric)) |>
  format_titles()

dt_clean(
  rate_based_top,
  "Top 10 CBSAs - Rate-Based Housing Growth Metric"
) |>
  formatRound(c("Population Growth 5yr", "Rate Based Metric"), 2) |>
  formatRound(c("Permits To Growth Ratio"), 4)
Show code
# Step 11: Rate-Based Metric - Bottom 10
rate_based_bottom <- housing_data |>
  filter(year == recent_year, !is.na(rate_based_metric)) |>
  slice_min(rate_based_metric, n = 10) |>
  select(
    NAME, year, population_growth_5yr, new_housing_units_permitted,
    permits_to_growth_ratio, rate_based_metric
  ) |>
  arrange(rate_based_metric) |>
  format_titles()

dt_clean(
  rate_based_bottom,
  "Bottom 10 CBSAs - Rate-Based Housing Growth Metric"
) |>
  formatRound(c("Population Growth 5yr", "Rate Based Metric"), 2) |>
  formatRound(c("Permits To Growth Ratio"), 4)
Show code
# Step 12: Composite Score - Top 10
composite_top <- housing_data |>
  filter(year_indicator == 1, !is.na(composite_score_rolling)) |>
  slice_max(composite_score_rolling, n = 10, na_rm = TRUE) |>
  select(
    NAME, year, instantaneous_metric, rate_based_metric,
    composite_score, composite_score_rolling
  ) |>
  arrange(desc(composite_score_rolling)) |>
  format_titles()

dt_clean(
  composite_top,
  "Top 10 CBSAs - Composite Housing Growth Score"
) |>
  formatRound(c("Instantaneous Metric", "Rate Based Metric", "Composite Score", "Composite Score Rolling"), 2)
Show code
# Step 13: Composite Score - Bottom 10
composite_bottom <- housing_data |>
  filter(year_indicator == 1, !is.na(composite_score_rolling)) |>
  slice_min(composite_score_rolling, n = 10, na_rm = TRUE) |>
  select(
    NAME, year, instantaneous_metric, rate_based_metric,
    composite_score, composite_score_rolling
  ) |>
  arrange(composite_score_rolling) |>
  format_titles()

dt_clean(
  composite_bottom,
  "Bottom 10 CBSAs - Composite Housing Growth Score"
) |>
  formatRound(c("Instantaneous Metric", "Rate Based Metric", "Composite Score", "Composite Score Rolling"), 2)

The housing growth metrics reveal a complex relationship between housing supply and demographic change across American metropolitan areas. The rate-based metric, which compares housing permits to five-year population growth, captures an important distinction: cities experiencing population decline such as New York, Los Angeles, and Chicago show exceptionally high permit-to-growth ratios, indicating substantial housing construction despite shrinking populations. This pattern reflects two competing dynamics: either these large metros are actively building to attract new residents and reverse decline, or they are replacing aging housing stock in mature markets.

Conversely, smaller growing metros show more moderate ratios, suggesting their housing supply is more closely calibrated to their population increases. When combined with the instantaneous metric (which measures permits relative to current population size), this composite score identifies which cities are meeting housing demand, which are those building enough homes for both existing residents and new population growth.

Cities scoring highest on this composite measure are successfully balancing affordability pressures with supply expansion, while those scoring lowest face either restrictive zoning that suppresses building or declining demand that reduces construction incentives. This framework reveals that housing adequacy cannot be measured by raw permit counts alone; it requires understanding how permit activity responds to the specific demographic and economic context of each metropolitan area.

Visualization

The visualizations below examine the relationship between two housing affordability metrics we developed: the Rent Burden Index and the Housing Growth Score. These plots are designed to identify metropolitan areas that exemplify “YIMBY” (Yes In My Backyard) success: cities that addressed high housing costs by building more homes, improving affordability while at the same time maintaining or growing their populations.

We define YIMBY success using four criteria:

  1. High early rent burden (index > 50 in 2009–2013) — indicates an affordability challenge.
  2. Decreasing rent burden — affordability has improved over the study period.
  3. Population growth — the metro remained attractive and grew.
  4. Above-average housing growth (score > 50) — the metro built enough housing.

Metropolitan areas meeting all four criteria demonstrate that building more housing can successfully address affordability challenges without causing population decline. The following visualizations highlight these success stories and show how different cities compare using these metrics.

Show code
library(dplyr)
library(ggplot2)
library(gghighlight)
library(ggrepel)
library(readr)
library(DT)
library(htmltools)

# title-on-top, no controls
dt_clean <- function(df, caption_text = NULL) {
  datatable(
    df,
    rownames = FALSE,
    caption = if (!is.null(caption_text)) {
      tags$caption(
        style = "caption-side: top; text-align: left; font-weight: 600; padding-bottom: 6px;",
        caption_text
      )
    } else {
      NULL
    },
    options = list(
      dom = "t",
      paging = FALSE, searching = FALSE, lengthChange = FALSE, info = FALSE,
      autoWidth = TRUE, scrollX = TRUE
    )
  )
}

# title-on-top, pager only (10 per page), no search/length/info
dt_paginated <- function(df, caption_text = NULL, page_len = 10) {
  datatable(
    df,
    rownames = FALSE,
    caption = if (!is.null(caption_text)) {
      tags$caption(
        style = "caption-side: top; text-align: left; font-weight: 600; padding-bottom: 6px;",
        caption_text
      )
    } else {
      NULL
    },
    options = list(
      dom = "tp", # t = table, p = pagination; (no f,l,i controls)
      paging = TRUE, pageLength = page_len,
      searching = FALSE, lengthChange = FALSE, info = FALSE,
      autoWidth = TRUE, scrollX = TRUE
    )
  )
}


# ---- Step 1: Merge inputs ----
rent_burden_by_year <- rent_burden_data |>
  group_by(GEOID, NAME, year) |>
  summarise(
    avg_rent_burden_index = mean(rent_burden_index, na.rm = TRUE),
    .groups = "drop"
  )

housing_growth_by_year <- housing_data |>
  select(
    GEOID, NAME, year, population, composite_score,
    composite_score_rolling, instantaneous_metric, rate_based_metric
  ) |>
  distinct()

yimby_data <- rent_burden_by_year |>
  inner_join(housing_growth_by_year, by = c("GEOID", "NAME", "year")) |>
  arrange(GEOID, year)

# ---- Step 2: Criteria + score ----
yimby_criteria <- yimby_data |>
  group_by(GEOID, NAME) |>
  summarise(
    early_rent_burden = mean(avg_rent_burden_index[year < 2014], na.rm = TRUE),
    rent_burden_change = mean(avg_rent_burden_index[year >= 2020], na.rm = TRUE) -
      mean(avg_rent_burden_index[year < 2014], na.rm = TRUE),
    pop_start = population[year == min(year)],
    pop_end = population[year == max(year)],
    population_growth = pop_end - pop_start,
    population_growth_pct = ((pop_end - pop_start) / pop_start) * 100,
    avg_housing_growth = mean(composite_score_rolling, na.rm = TRUE),
    .groups = "drop"
  ) |>
  mutate(
    criterion_1_high_burden = as.integer(early_rent_burden > 50),
    criterion_2_decreased_burden = as.integer(rent_burden_change < 0),
    criterion_3_pop_growth = as.integer(population_growth > 0),
    criterion_4_housing_growth = as.integer(avg_housing_growth > 50),
    yimby_score = criterion_1_high_burden + criterion_2_decreased_burden +
      criterion_3_pop_growth + criterion_4_housing_growth,
    is_yimby = yimby_score == 4
  ) |>
  arrange(desc(yimby_score), desc(avg_housing_growth))


# ---- Table: Perfect YIMBYs
perfect_yimby <- yimby_criteria |>
  filter(is_yimby) |>
  transmute(
    NAME,
    `YIMBY Score` = yimby_score,
    `Early Rent Burden` = round(early_rent_burden, 2),
    `Rent Burden Change` = round(rent_burden_change, 2),
    `Population Growth Pct` = round(population_growth_pct, 2),
    `Avg Housing Growth` = round(avg_housing_growth, 2)
  )

if (nrow(perfect_yimby) > 0) {
  dt_paginated(perfect_yimby, "CBSAs Meeting All 4 YIMBY Criteria (Perfect YIMBY Success)", page_len = 10)
} else {
  htmltools::div(
    style = "font-style:italic; color:#666; margin: 0.4rem 0;",
    "No CBSAs meet all four criteria."
  )
}
Show code
top_yimby <- yimby_criteria |>
  filter(is_yimby) |>
  arrange(desc(avg_housing_growth)) |>
  slice_head(n = 10)

viz1_top <- ggplot(yimby_criteria, aes(avg_housing_growth, early_rent_burden)) +
  geom_point(aes(size = population_growth_pct, color = factor(yimby_score)), alpha = 0.6) +
  geom_point(
    data = top_yimby, aes(size = population_growth_pct), color = "#2ecc71",
    alpha = 0.9, stroke = 1.5, shape = 21
  ) +
  ggrepel::geom_text_repel(
    data = top_yimby,
    aes(label = stringr::str_extract(NAME, "^[^,]+")),
    size = 3, fontface = "bold", box.padding = 0.8,
    point.padding = 0.5, segment.color = "gray40",
    min.segment.length = 0, max.overlaps = 15, force = 2
  ) +
  geom_hline(yintercept = 50, linetype = "dashed", color = "black", alpha = 0.4) +
  geom_vline(xintercept = 50, linetype = "dashed", color = "black", alpha = 0.4) +
  scale_color_manual(
    values = c("0" = "#d3d3d3", "1" = "#bdbdbd", "2" = "#f39c12", "3" = "#3498db", "4" = "#2ecc71"),
    name = "YIMBY Score"
  ) +
  scale_size_continuous(range = c(1, 10), name = "Population\nGrowth %") +
  coord_cartesian(xlim = c(20, 250)) +
  labs(
    title = "Top 10 YIMBY Success Stories",
    subtitle = "Highest housing growth among CBSAs meeting all 4 criteria",
    x = "Average Housing Growth Score",
    y = "Early Period Rent Burden Index (2009–2013)",
    caption = "Green = perfect YIMBY (4/4) • Labels: top 10 by housing growth"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 10),
    axis.title = element_text(size = 11, face = "bold"),
    legend.position = "right"
  )
print(viz1_top)

The first chart above spotlights the best-performing metros among the “perfect YIMBY” group—those that combined strong housing growth with improved affordability. In the plot below, we broaden the view to see how all metros perform across rent-burden change and population growth, with marker size indicating housing growth.

Show code
# ---- Viz 2: 4-criteria map (change vs growth) ----
viz2 <- ggplot(yimby_criteria, aes(rent_burden_change, population_growth_pct)) +
  geom_point(aes(size = avg_housing_growth, color = factor(yimby_score)), alpha = 0.7) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black", alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "black", alpha = 0.5) +
  annotate("rect", xmin = -Inf, xmax = 0, ymin = 0, ymax = Inf, alpha = 0.08, fill = "green") +
  annotate("text",
    x = -8, y = 55,
    label = "IDEAL: Decreasing rent burden + Growing population",
    fontface = "bold", color = "darkgreen", size = 3.5
  ) +
  scale_color_manual(
    values = c("0" = "#d3d3d3", "1" = "#bdbdbd", "2" = "#f39c12", "3" = "#3498db", "4" = "#2ecc71"),
    name = "YIMBY Score"
  ) +
  scale_size_continuous(range = c(2, 12), name = "Housing\nGrowth Score") +
  labs(
    title = "YIMBY Success: Four Criteria Combined",
    subtitle = "Green shading shows ideal quadrant (rent burden ↓ + population ↑)",
    x = "Rent Burden Change (negative = improvement)",
    y = "Population Growth %",
    caption = "Color = YIMBY score • Size = housing growth • Green zone = best outcomes"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 10),
    axis.title = element_text(size = 11, face = "bold"),
    legend.position = "right"
  )
print(viz2)

🏛 Policy Brief: Building America Forward — A Federal YIMBY Incentive Program

Note

Purpose

This brief advocates for a Federal YIMBY Incentive Program that rewards local governments for expanding housing supply and improving affordability.
The goal is to make housing attainable for working families while sustaining job growth in construction, health care, and other essential industries.

1. Congressional Sponsors

  • Primary Sponsor — Wilmington, NC Metro Area
    Wilmington exemplifies YIMBY success. Since 2009 it has permitted new housing at one of the fastest rates relative to existing households.
    Median rents have stayed moderate, showing how flexible zoning and proactive permitting can restrain costs even following their population and job growth.

  • Co-Sponsor — New York–Newark–Jersey City, NY-NJ-PA Metro Area
    The New York metro area faces some of the nation’s highest rent burdens and tightest housing supply.
    A federal incentive program would help such regions modernize zoning practices and accelerate building approvals by tying funding to measurable improvements in housing growth and affordability.


2. Key Beneficiary Occupations

Occupation Why This Group Should Support the Bill
Construction and Skilled Trades Federal incentives for housing expansion directly boost demand for construction jobs. Wilmington’s data show that every 1 % increase in permitted units corresponds to measurable growth in construction employment and wages.
Health Care and Social Assistance (NAICS 62) High rent burdens disproportionately affect nurses, aides, and social-service staff. In metros such as New York, median rents consume over 40 % of household income. Lower rents would free disposable income and stabilize this essential workforce.

3. Metrics for Targeting Funding

Housing Growth Rate (“YIMBY Index”)

 

\[ \text{Housing Growth} = \frac{\text{New Housing Units Permitted}}{\text{Existing Households}} \times 100 \]

Measures how aggressively a metro adds new housing relative to its current stock.

Rent Burden

 

\[ \text{Rent Burden} = \frac{\text{Median Monthly Rent}}{\text{Median Household Income} / 12} \]

Estimates the share of income spent on rent — lower values mean greater affordability.

Both indicators are easily derived from the American Community Survey (ACS), enabling transparent, evidence-based targeting of federal grants.


4. Why This Matters

  • For Voters: Affordable housing allows families to save, spend, and stay in their communities ultimately.
  • For Industries: Stable housing costs sustain labor supply in key sectors like construction and healthcare.
  • For Congress: A bipartisan opportunity — pro-growth, pro-worker, and fiscally efficient.

Conclusion

A Federal YIMBY Incentive Program would channel federal support toward cities that embrace responsible growth.
By pairing a YIMBY leader (Wilmington, NC) with a NIMBY reformer (NY-NJ) and aligning the interests of construction and health-care workers, Congress can advance an agenda that lowers rents, expands opportunities and keep America building.

Extra Credit Opportunity #02: Highlighting Important Units in a Spaghetti Plot

For this extra credit, I use the gghighlight package to highlight the NYC and Los Angeles CBSAs in the visualization of household size over time and low-light the other lines.

Show code
library(dplyr)
library(ggplot2)
library(gghighlight)

# --- Data prep
household_size_data <- POPULATION |>
  inner_join(HOUSEHOLDS, by = c("GEOID", "NAME", "year")) |>
  mutate(avg_household_size = population / households)

highlight_geoids <- c(
  "35620", # New York Metro
  "31080", # Los Angeles–Long Beach–Anaheim
  "31100" # Los Angeles–Long Beach–Santa Ana (older definition)
)

household_size_data <- household_size_data |>
  mutate(
    city_label = case_when(
      GEOID == "35620" ~ "New York Metro Area",
      GEOID %in% c("31080", "31100") ~ "Los Angeles Metro Area",
      TRUE ~ NAME
    )
  )

# --- Plot
ggplot(household_size_data, aes(x = year, y = avg_household_size, group = GEOID)) +
  geom_line(aes(color = city_label), alpha = 0.6, linewidth = 0.8) +
  gghighlight(
    GEOID %in% highlight_geoids,
    use_direct_label = TRUE,
    label_key = city_label,
    label_params = list(size = 4, fontface = "bold"),
    unhighlighted_params = list(
      colour = alpha("gray80", 0.3),
      linewidth = 0.4
    )
  ) +
  scale_color_manual(
    values = c(
      "New York Metro Area" = "#e74c3c",
      "Los Angeles Metro Area" = "#3498db"
    ),
    guide = "none"
  ) +
  scale_y_continuous(
    breaks = seq(2, 3.5, 0.25),
    limits = c(2, 3.5)
  ) +
  scale_x_continuous(
    breaks = seq(2009, 2023, 2)
  ) +
  labs(
    title = "Evolution of Average Household Size Over Time (2009–2023)",
    subtitle = "New York (red) and Los Angeles (blue) highlighted among 380+ metro areas",
    x = "Year",
    y = "Average Household Size (persons per household)",
    caption = paste(
      "Source: American Community Survey (ACS)",
      "\nNote: 2020 data unavailable due to COVID-19",
      "\nMetropolitan area definitions updated by Census Bureau over time",
      sep = "\n"
    )
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 11, hjust = 0.5),
    plot.caption = element_text(size = 9, hjust = 0, color = "gray40"),
    axis.title = element_text(size = 11, face = "bold"),
    axis.text = element_text(size = 10),
    panel.grid.minor = element_line(linetype = "dotted", linewidth = 0.3)
  )

Note

Why two Los Angeles lines?
The Census Bureau and BLS use slightly different metro definitions. “Los Angeles–Long Beach–Anaheim” is the full CSA; “Los Angeles–Long Beach–Santa Ana” is a metropolitan division. Both are shown to preserve data integrity.

Conclusion

The analysis conducted for this project shows how population growth, housing construction, and affordability interact across metropolitan areas in the United States. Using data from the U.S. Census Bureau (ACS) and the Bureau of Labor Statistics (QCEW), this project identified wide variation in local housing outcomes: some regions continue to expand their housing supply in step with demand, while others like NYC as an example, show signs of stagnation and cost pressure.

By linking population trends to permit activity, the Housing Growth Metric revealed which CBSAs are keeping pace with demographic change. Areas like Wilmington, NC and other fast-growing metros demonstrate how a balanced way of new permits issuance can help stabilize rents and sustain affordability, while more constrained metros such as those in the New York–New Jersey area illustrate how limited housing production can amplify rent burden despite their economic strength.

From a policy standpoint, these findings support the argument for a federal YIMBY initiative that rewards cities not only for expanding housing access but also for creating communities where affordability and opportunity can coexist. Encouraging local governments to align zoning flexibility, infrastructure investment, and long-term planning could lead to a more balanced regional housing markets.

In sum, the evidence points to a clear takeaway: sustainable housing growth and affordability depend on supply responsiveness. Cities that embrace inclusive, forward-looking planning will be better positioned to attract residents, especially the millennial as an example, and deliver on the broader promise of making every backyard affordable for all.